import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import pandas_profiling
import vtreat
import category_encoders as ce
from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import feature_engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from feature_engine.categorical_encoders import RareLabelCategoricalEncoder
from feature_engine.categorical_encoders import MeanCategoricalEncoder
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
df = pd.read_csv('./allstate.csv')
df = df[['cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat15', 'cat116',
'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9',
'loss']]
df = df[0:40000]
df.head()
df.shape
%%time
pandas_profiling.ProfileReport(df)
## Test / Train
catColumns = ['cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat15', 'cat116']
numColumns = ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9']
X = df.drop('loss', axis=1)
y = df['loss']
# Seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=251)
# Drop rows where we have an unseen level (for convenience)
# Normally, you would want to capture these into an "unknown / unseen" category
for col in catColumns:
y_test = y_test[X_test[col].isin(X_train[col])]
X_test = X_test[X_test[col].isin(X_train[col])]
def evaluate(truth, pred):
    """Score predictions against ground truth.

    Returns a (MAE, MSE, RMSE) tuple so callers can unpack all three
    error metrics from a single call.
    """
    abs_error = mean_absolute_error(truth, pred)
    sq_error = mean_squared_error(truth, pred)
    return (abs_error, sq_error, math.sqrt(sq_error))
# Accumulators: error metrics per (estimator, encoding) pair, and the
# column count before/after each categorical treatment.
resultsDF = pd.DataFrame(columns=['Estimator', 'Encoding', 'MAE', 'MSE', 'RMSE'])
shapeDF = pd.DataFrame(columns=['Original', 'Final'])
# Preview the one-hot expansion and record how wide the design matrix becomes.
ohe = OneHotCategoricalEncoder(variables = catColumns, drop_last=True)
sample = Pipeline(steps=[('ohe', ohe)]).fit_transform(X_train, y_train)
shapeDF.loc['One Hot'] = [X.shape[1], sample.shape[1]]
sample.head()
%%time
# Linear regression on the one-hot design matrix (drop_last=True avoids the
# dummy-variable trap for the linear model).
est = LinearRegression()
ohe = OneHotCategoricalEncoder(variables = catColumns, drop_last=True)
pipe = Pipeline(steps=[('ohe', ohe), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
#Skip Recording due to instability
#resultsDF.loc['Linear:One Hot'] = ['Linear', 'One Hot', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Random forest on the full one-hot expansion (drop_last=False keeps every level;
# trees are unaffected by the collinearity that trips up the linear model).
est = RandomForestRegressor(n_estimators=100)
ohe = OneHotCategoricalEncoder(variables = catColumns, drop_last=False)
pipe = Pipeline(steps=[('ohe', ohe), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:One Hot'] = ['Random Forest', 'One Hot', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Rare-label grouping: infrequent category levels are collapsed into a single
# rare bucket before one-hot encoding, shrinking the design matrix.
rare = RareLabelCategoricalEncoder(variables = catColumns)
sample = Pipeline(steps=[('rare', rare)]).fit_transform(X, y)
sample.head()
rare = RareLabelCategoricalEncoder(variables = catColumns)
ohe = OneHotCategoricalEncoder(variables = catColumns)
# NOTE(review): this preview fits on the full X (train+test). The model
# pipelines below call Pipeline.fit(X_train, ...), which refits both encoders
# on the training split, so the preview does not leak into the scores.
sample = Pipeline(steps=[('rare', rare), ('ohe', ohe)]).fit_transform(X, y)
shapeDF.loc['Rare'] = [X.shape[1], sample.shape[1]]
sample.head()
%%time
# Linear regression on the rare-grouped one-hot features.
est = LinearRegression()
pipe = Pipeline(steps=[('rare', rare), ('ohe', ohe), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:Rare'] = ['Linear', 'Rare', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
# Random forest on the same treatment.
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('rare', rare), ('ohe', ohe), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Rare'] = ['Random Forest', 'Rare', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
ord = OrdinalCategoricalEncoder(variables = catColumns)
sample = Pipeline(steps=[('ord', ord)]).fit_transform(X, y)
shapeDF.loc['Ordinal'] = [X.shape[1], sample.shape[1]]
sample.head()
%%time
est = LinearRegression()
pipe = Pipeline(steps=[('ord', ord), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:Ordinal'] = ['Linear', 'Ordinal', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('ord', ord), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Ordinal'] = ['Random Forest', 'Ordinal', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Target-mean encoding: each level is replaced by the mean 'loss' observed
# for that level during fit.
meanE = MeanCategoricalEncoder(variables = catColumns)
# Preview fit on the full X; the model pipelines below refit on X_train only.
sample = Pipeline(steps=[('meanE', meanE)]).fit_transform(X, y)
shapeDF.loc['Mean'] = [X.shape[1], sample.shape[1]]
sample.head()
%%time
# Linear regression on mean-encoded features.
est = LinearRegression()
pipe = Pipeline(steps=[('meanE', meanE), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:Mean'] = ['Linear', 'Mean', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
# Random forest on the same treatment.
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('meanE', meanE), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Mean'] = ['Random Forest', 'Mean', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Count/frequency encoding: each level is replaced by how often it occurs.
freq = CountFrequencyCategoricalEncoder(variables = catColumns)
# Preview fit on the full X; the model pipelines below refit on X_train only.
sample = Pipeline(steps=[('freq', freq)]).fit_transform(X, y)
shapeDF.loc['Count'] = [X.shape[1], sample.shape[1]]
sample.head()
%%time
# Linear regression on frequency-encoded features.
est = LinearRegression()
pipe = Pipeline(steps=[('freq', freq), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:Freq'] = ['Linear', 'Freq', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
# Random forest on the same treatment.
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('freq', freq), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Freq'] = ['Random Forest', 'Freq', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Combination treatment: apply every encoder side by side on the same
# categorical columns, prefix each view to keep names unique, and
# concatenate everything into one wide frame for the model to pick from.
X_train_num = X_train[numColumns]
X_train_cat = X_train[catColumns]
X_test_num = X_test[numColumns]
X_test_cat = X_test[catColumns]
# One-hot view (fit on train only).
ohe = OneHotCategoricalEncoder(variables=catColumns, drop_last=True)
X_train_ohe = ohe.fit_transform(X_train_cat, y_train).add_prefix('ohe_')
X_test_ohe = ohe.transform(X_test_cat).add_prefix('ohe_')
# Rare-label grouping followed by one-hot.
rare = RareLabelCategoricalEncoder(variables=catColumns)
ohe = OneHotCategoricalEncoder(variables=catColumns)
rarePipeline = Pipeline(steps=[('rare', rare), ('ohe', ohe)])
X_train_rare = rarePipeline.fit_transform(X_train_cat, y_train).add_prefix('rare_')
X_test_rare = rarePipeline.transform(X_test_cat).add_prefix('rare_')
# Ordinal integer codes.
ordinal = OrdinalCategoricalEncoder(variables=catColumns)
X_train_ordinal = ordinal.fit_transform(X_train_cat, y_train).add_prefix('ordinal_')
X_test_ordinal = ordinal.transform(X_test_cat).add_prefix('ordinal_')
# Target-mean view.
meanE = MeanCategoricalEncoder(variables=catColumns)
X_train_mean = meanE.fit_transform(X_train_cat, y_train).add_prefix('mean_')
X_test_mean = meanE.transform(X_test_cat).add_prefix('mean_')
# Count/frequency view.
freq = CountFrequencyCategoricalEncoder(variables=catColumns)
X_train_freq = freq.fit_transform(X_train_cat, y_train).add_prefix('freq_')
X_test_freq = freq.transform(X_test_cat).add_prefix('freq_')
# Numeric columns plus all five encoded views, column-wise.
X_train_combo = pd.concat([X_train_num,
X_train_ohe,
X_train_rare,
X_train_ordinal,
X_train_mean,
X_train_freq], axis=1)
X_test_combo = pd.concat([X_test_num,
X_test_ohe,
X_test_rare,
X_test_ordinal,
X_test_mean,
X_test_freq], axis=1)
shapeDF.loc['Combination'] = [X.shape[1], X_train_combo.shape[1]]
X_train_combo.head()
%%time
# Linear model on the combined feature set. The Pipeline wrapper is kept
# (with no transform steps) only for its verbose timing output.
est = LinearRegression()
pipe = Pipeline(steps=[('est', est)], verbose = True)
pipe.fit(X_train_combo, y_train)
y_hat = pipe.predict(X_test_combo)
mae, mse, rmse = evaluate(y_test, y_hat)
# Ignoring due to instability
# resultsDF.loc['Linear:Combo'] = ['Linear', 'Combo', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
# Random forest on the combined feature set.
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('est', est)], verbose = True)
pipe.fit(X_train_combo, y_train)
y_hat = pipe.predict(X_test_combo)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Combination'] = ['Random Forest', 'Combo', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Interim leaderboards, best MAE first, split by estimator family.
resultsDF[resultsDF.Estimator == 'Linear'].sort_values('MAE')
# NOTE(review): this filter matches the literal label 'Random Forest', so any
# row recorded under a different Estimator spelling (e.g. 'RF') is excluded.
resultsDF[resultsDF.Estimator == 'Random Forest'].sort_values('MAE')
# Feature hashing: project all categorical levels into 8 hashed columns —
# fixed width regardless of cardinality, at the cost of hash collisions.
he = ce.HashingEncoder(cols=catColumns, n_components=8)
sample = he.fit_transform(X_train)
sample.head()
%%time
# Linear regression on the hashed features.
est = LinearRegression()
pipe = Pipeline(steps=[('he', he), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:Hash'] = ['Linear', 'Hash', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('he', he), ('est', est)], verbose = True)
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:Hash'] = ['RF', 'Hash', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# vtreat: automated variable treatment for a numeric outcome — fits on the
# training split only, then applies the same plan to the test split.
plan = vtreat.NumericOutcomeTreatment()
X_train_vtreat = plan.fit_transform(X_train, y_train)
X_test_vtreat = plan.transform(X_test)
shapeDF.loc['VTreat'] = [X.shape[1], X_train_vtreat.shape[1]]
X_train_vtreat.head()
%%time
# Linear regression on the vtreat-derived features.
est = LinearRegression()
pipe = Pipeline(steps=[('est', est)], verbose = True)
pipe.fit(X_train_vtreat, y_train)
y_hat = pipe.predict(X_test_vtreat)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Linear:VTreat'] = ['Linear', 'VTreat', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
%%time
# Random forest on the same vtreat features.
est = RandomForestRegressor(n_estimators=100)
pipe = Pipeline(steps=[('est', est)], verbose = True)
pipe.fit(X_train_vtreat, y_train)
y_hat = pipe.predict(X_test_vtreat)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['RF:VTreat'] = ['Random Forest', 'VTreat', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# CatBoost on raw data: no external encoding — the object-dtype columns are
# passed to CatBoost natively via their positional cat_features indices.
cat_idx = X_train.columns.get_indexer(X_train.select_dtypes('object').columns)
# Hold out 10% of the training split as an eval set for early stopping.
# NOTE(review): unlike the main split at the top, this call sets no
# random_state, so the eval split (and the resulting scores) vary run to run.
X_cb_train, X_cb_val, y_cb_train, y_cb_val = train_test_split(X_train, y_train, test_size=0.1)
train_pool = Pool(X_cb_train, y_cb_train, cat_features=cat_idx)
val_pool = Pool(X_cb_val, y_cb_val, cat_features=cat_idx)
test_pool = Pool(X_test, cat_features=cat_idx)
shapeDF.loc['CatBoost:None'] = [X.shape[1], X_cb_train.shape[1]]
# specify the training parameters
model = CatBoostRegressor(iterations=500,
depth=6,
learning_rate=0.1,
early_stopping_rounds=5,
loss_function='RMSE')
#train the model
model.fit(train_pool, eval_set=val_pool)
y_hat = model.predict(test_pool)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Catboost:None'] = ['CatBoost', 'None', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# CatBoost on the vtreat-treated features.
# NOTE(review): DataFrame.to_sparse() was deprecated in pandas 0.25 and
# removed in 1.0; this round-trip (presumably densifying any sparse vtreat
# output) will raise on modern pandas — verify against the pinned version.
X_train_treated = X_train_vtreat.to_sparse().to_dense()
X_test_treated = X_test_vtreat.to_sparse().to_dense()
# Recompute categorical indices on the treated frame (vtreat renames/expands columns).
cat_idx = X_train_treated.columns.get_indexer(X_train_treated.select_dtypes('object').columns)
# 10% eval holdout for early stopping (no random_state — non-deterministic split).
X_cb_train, X_cb_val, y_cb_train, y_cb_val = train_test_split(X_train_treated, y_train, test_size=0.1)
train_pool = Pool(X_cb_train, y_cb_train, cat_features=cat_idx)
val_pool = Pool(X_cb_val, y_cb_val, cat_features=cat_idx)
test_pool = Pool(X_test_treated, cat_features=cat_idx)
shapeDF.loc['CatBoost:VTreat'] = [X.shape[1], X_train_treated.shape[1]]
# specify the training parameters
model = CatBoostRegressor(iterations=500,
depth=6,
learning_rate=0.1,
early_stopping_rounds=5,
loss_function='RMSE')
#train the model
model.fit(train_pool, eval_set=val_pool)
y_hat = model.predict(test_pool)
mae, mse, rmse = evaluate(y_test, y_hat)
resultsDF.loc['Catboost:VTreat'] = ['CatBoost', 'VTreat', mae, mse, rmse]
print('MAE {:,.2f}'.format(mae))
print('RMSE {:,.2f}'.format(rmse))
# Column counts before/after each treatment.
shapeDF
plt.figure(figsize=(15,10))
shapeDF['Final'].sort_values(ascending=False).plot.barh(color='#1f77b4')
plt.title('Columns After Transformation')
# Final leaderboards and horizontal bar charts (lower bars are better).
resultsDF.sort_values('MAE')
plt.figure(figsize=(15,10))
resultsDF['MAE'].sort_values(ascending=False).plot.barh(color='#1f77b4')
plt.title('MAE by Model & Treatment')
resultsDF.sort_values('RMSE')
plt.figure(figsize=(15,10))
resultsDF['RMSE'].sort_values(ascending=False).plot.barh(color='#1f77b4')
plt.title('RMSE by Model & Treatment')
# Percentage MAE improvement of RF+VTreat over the RF one-hot baseline,
# expressed relative to the baseline MAE.
mae_rf = resultsDF.loc['RF:One Hot', 'MAE']
mae_vt = resultsDF.loc['RF:VTreat', 'MAE']
improvement = (mae_rf - mae_vt) * 100.0 / mae_rf
print('VTreat Improvement over Random Forest with OneHot treatement : {:.2f}%'.format(improvement))
# Percentage MAE improvement of CatBoost+VTreat over the RF one-hot baseline.
mae_rf = resultsDF.loc['RF:One Hot', 'MAE']
mae_cat = resultsDF.loc['Catboost:VTreat', 'MAE']
# Divide by the baseline (mae_rf) so the figure is "improvement over the
# baseline", matching the VTreat calculation above; the original divided by
# mae_cat, which overstates the improvement.
improvement = (mae_rf - mae_cat) * 100.0 / mae_rf
print('CatBoost + VTreat Improvement over Random Forest with OneHot treatement : {:.2f}%'.format(improvement))